# load packages, installing if missing
if (!require(librarian)){
install.packages("librarian")
library(librarian)
}
## Loading required package: librarian
librarian::shelf(
dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio, tidyverse)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
## Warning: multiple methods tables found for 'crop'
## Warning: multiple methods tables found for 'extend'
select <- dplyr::select # overwrite raster::select
# set random seed for reproducibility
set.seed(42)
# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F)
I have chosen the giraffe, Giraffa, as my species for this lab. I have always loved large mammals and giraffes are just especially majestic and interesting animals.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- TRUE
if (!file.exists(obs_geo) | redo){
# get species occurrence data from GBIF with coordinates
(res <- spocc::occ(
query = 'Giraffa',
from = 'gbif', has_coords = T, limit = 10000))
# extract data frame from result
#df <- res$gbif$data[[1]] # from Ben
df <- res$gbif$data[[1]] %>%
filter(longitude > -26.41,
longitude < 54.98,
latitude > -36.21,
latitude < 38.65) # filter for just the African continent
readr::write_csv(df, obs_csv)
# convert to points of observation from lon/lat columns in data frame
obs <- df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9225
# show points on map
mapview::mapview(obs, map.types = "Stamen.Terrain")
Question 1. How many observations total are in GBIF for your species? (Hint: ?occ) There are 9,225 giraffe observations in GBIF, filtering for observations on the African continent only. Before filtering, there were 9,300 observations total in GBIF.
Question 2. Do you see any odd observations, like marine species on land or vice versa? If so, please see the Data Cleaning and explain what you did to fix or remove these points. There are no odd observations, and all of the points are on land; however, I did decide to filter my observations to those appearing only on the African continent. The observations were most abundant here to start with anyways, and since they are native to Africa I felt that it was a good idea to build the species distribution model based on their native habitat. To filter the observations, I included observations that fell within a bounding box of the African continent.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_climaticMoistureIndex", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)
Question: What environmental layers did you choose as predictors? Can you find any support for these in the literature? I decided to use the following layers as predictors: WC_alt, WC_bio1, WC_bio2, ER_topoWet, and ER_climaticMoistureIndex. Because giraffes are typically found at low altitudes, in arid environments like grasslands and savannas, I thought the climatic moisture index and altitude layer were important to include. Additionally, I wanted to include WC_bio1 and WC_bio2 to see if temperature played a role in determining the giraffe habitat since they prefer to live in warmer envrieonments. Lastly, I included the ER_topoWet layer because giraffes typically prefer dry climates and I thought this layer would be a good predictor.
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
if (!file.exists(obs_hull_geo) | redo){
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, obs_hull_geo)
}
## Warning in CPL_write_ogr(obj, dsn, layer, driver,
## as.character(dataset_options), : GDAL Error 6: DeleteLayer() not supported by
## this dataset.
obs_hull <- read_sf(obs_hull_geo)
# show points on map
mapview(
list(obs, obs_hull))
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(env_stack_grd) | redo){
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
}
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
if (!file.exists(absence_geo) | redo){
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
# show map
# mapview(obs) +
# mapview(r_obs)
# create mask for
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
# generate random points inside mask
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
## Rows: 18450 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_climaticMoistureIndex, ER...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
## Warning: Removed 134 rows containing non-finite values (stat_density).
librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 18450
datatable(pts_env, rownames = F)
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 30 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 30 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 30 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 30 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 30 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 26 rows containing missing values
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 26 rows containing missing values (geom_point).
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 18420
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0318 -0.3210 0.1806 0.3142 1.2636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.195e-01 4.290e-02 -7.449 9.83e-14 ***
## WC_alt 2.205e-04 9.873e-06 22.333 < 2e-16 ***
## WC_bio1 3.099e-02 1.418e-03 21.855 < 2e-16 ***
## WC_bio2 -6.193e-04 2.010e-03 -0.308 0.758
## ER_climaticMoistureIndex -2.616e-01 1.579e-02 -16.563 < 2e-16 ***
## ER_topoWet -4.143e-02 2.799e-03 -14.799 < 2e-16 ***
## lon 1.226e-02 2.777e-04 44.155 < 2e-16 ***
## lat -1.161e-02 2.225e-04 -52.202 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4077 on 18412 degrees of freedom
## Multiple R-squared: 0.3354, Adjusted R-squared: 0.3352
## F-statistic: 1327 on 7 and 18412 DF, p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.4561073 1.0318550
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8081 -0.6788 -0.0359 0.8206 3.2475
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.479e+00 3.423e-01 -24.767 <2e-16 ***
## WC_alt 1.771e-03 7.431e-05 23.830 <2e-16 ***
## WC_bio1 3.302e-01 1.357e-02 24.333 <2e-16 ***
## WC_bio2 -9.404e-03 1.229e-02 -0.765 0.444
## ER_climaticMoistureIndex -1.669e+00 1.047e-01 -15.943 <2e-16 ***
## ER_topoWet -2.978e-01 1.795e-02 -16.590 <2e-16 ***
## lon 8.227e-02 2.105e-03 39.091 <2e-16 ***
## lat -8.317e-02 2.073e-03 -40.125 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25536 on 18419 degrees of freedom
## Residual deviance: 17897 on 18412 degrees of freedom
## AIC: 17913
##
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0003992854 0.9806031792
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_climaticMoistureIndex) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_climaticMoistureIndex) +
## s(ER_topoWet) + s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.787 0.172 -10.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 6.986 7.977 206.0 <2e-16 ***
## s(WC_bio1) 8.738 8.956 325.1 <2e-16 ***
## s(WC_bio2) 8.502 8.910 246.5 <2e-16 ***
## s(ER_climaticMoistureIndex) 8.758 8.951 656.8 <2e-16 ***
## s(ER_topoWet) 8.836 8.990 188.8 <2e-16 ***
## s(lon) 8.990 9.000 490.1 <2e-16 ***
## s(lat) 8.836 8.987 1216.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.72 Deviance explained = 66.3%
## UBRE = -0.52588 Scale est. = 1 n = 18420
# show term plots
plot(mdl, scale=0)
Question: Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?
Altitude above about 2000, Bio_1 above about 26, Bio_2 above about 17, ER_topo_Wet under 9, and latitude from about -40 to -20 seem to contribute most towards presence. On the flip side, Bio_1 from about 0-20, ER_climaticMoistureIndex from -1 and above, ER_topoWet above 14, and longitude froom about -7 to 0 seem to contribute the most to absense.
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
# plot term plots
response(mdl)
Question: Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?
ER_climaticMoistureIndex and WC_bio1 seem to contribute the most towards giraffe presence in the maxent model. This is different from the GAM results because while it seemed like ER_climaticMoistureIndex did influence presence, it was not as obvious how much it contributed to presence.
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# Lab 1c. Species Distribution Modeling - Decision Trees
# global knitr chunk options
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE)
# load packages
librarian::shelf(
caret, # m: modeling framework
dplyr, ggplot2 ,here, readr,
pdp, # X: partial dependence plots
rpart, # m: recursive partition modeling
rpart.plot, # m: recursive partition plotting
rsample, # d: split train/test data
skimr, # d: skim summarize data table
vip) # X: variable importance
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# options
options(
scipen = 999,
readr.show_col_types = F)
set.seed(42)
# graphical theme
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 18420 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9222, 1: 9198 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_alt | 0 | 1 | 811.57 | 515.77 | -132.00 | 368.00 | 725.00 | 1209.00 | 3511.00 | ▇▇▂▁▁ |
| WC_bio1 | 0 | 1 | 22.87 | 3.46 | 7.40 | 21.00 | 22.30 | 25.20 | 30.20 | ▁▁▃▇▃ |
| WC_bio2 | 0 | 1 | 13.96 | 2.27 | 5.30 | 12.60 | 14.00 | 15.80 | 20.70 | ▁▂▇▇▁ |
| ER_climaticMoistureIndex | 0 | 1 | -0.58 | 0.30 | -1.00 | -0.79 | -0.61 | -0.34 | 0.60 | ▇▆▃▁▁ |
| ER_topoWet | 0 | 1 | 12.37 | 1.39 | 7.25 | 11.27 | 12.43 | 13.22 | 15.80 | ▁▂▇▇▂ |
| lon | 0 | 1 | 24.20 | 13.12 | -16.71 | 16.28 | 29.04 | 34.30 | 40.58 | ▁▂▂▃▇ |
| lat | 0 | 1 | -0.27 | 16.39 | -34.16 | -12.91 | -0.61 | 12.21 | 38.21 | ▃▃▇▃▂ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 9222 9198
table(d_train$present)
##
## 0 1
## 7377 7358
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 14735
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14735 7358 0 (0.50064472 0.49935528)
## 2) lat>=4.34595 5072 504 0 (0.90063091 0.09936909) *
## 3) lat< 4.34595 9663 2809 1 (0.29069647 0.70930353) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
## 2.2 Partition, depth=default
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 14735
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 14735 7358 0 (0.50064472 0.49935528)
## 2) lat>=4.34595 5072 504 0 (0.90063091 0.09936909)
## 4) WC_bio1< 28.85 4326 83 0 (0.98081368 0.01918632) *
## 5) WC_bio1>=28.85 746 325 1 (0.43565684 0.56434316)
## 10) ER_climaticMoistureIndex< -0.795 326 19 0 (0.94171779 0.05828221) *
## 11) ER_climaticMoistureIndex>=-0.795 420 18 1 (0.04285714 0.95714286) *
## 3) lat< 4.34595 9663 2809 1 (0.29069647 0.70930353)
## 6) lon< 30.79615 3545 1430 0 (0.59661495 0.40338505)
## 12) lat>=-15.38952 1379 107 0 (0.92240754 0.07759246) *
## 13) lat< -15.38952 2166 843 1 (0.38919668 0.61080332)
## 26) WC_bio1< 21.35 1223 560 0 (0.54210957 0.45789043)
## 52) ER_climaticMoistureIndex< -0.675 701 227 0 (0.67617689 0.32382311) *
## 53) ER_climaticMoistureIndex>=-0.675 522 189 1 (0.36206897 0.63793103) *
## 27) WC_bio1>=21.35 943 180 1 (0.19088017 0.80911983) *
## 7) lon>=30.79615 6118 694 1 (0.11343576 0.88656424) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.54974178 0 1.0000000 1.0231041 0.008246738
## 2 0.09309595 1 0.4502582 0.4506659 0.006889480
## 3 0.06523512 2 0.3571623 0.3585213 0.006324721
## 4 0.02609405 3 0.2919272 0.2943735 0.005841774
## 5 0.01678445 5 0.2397391 0.2425931 0.005382935
## 6 0.01000000 7 0.2061702 0.2144605 0.005101491
Question: Based on the complexity plot threshold, what size of tree is recommended?
A size of 8 is recommended based on the complexity plot threshold.
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
Question: what are the top 3 most important variables of your model?
Based on the plot above, lat, lon, and ER_climaticMoistureIndex are the most important variables of my model.
# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
colorkey = TRUE, screen = list(z = -20, x = -60))
# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
# load additional packages
librarian::shelf(
ranger) # random forest modeling
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1737821
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
Question: How might variable importance differ between rpart and RandomForest in your model outputs?
Besides lat and lon, WC_bio1 and ER_climaticMoistureIndex are the two most important variables in both the rpart model and the random forest model. The bottom three variables are also the same in both models, with ER_topoWet being the least important variable.
# global knitr chunk options
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE)
# load packages
librarian::shelf(
dismo, # species distribution modeling: maxent(), predict(), evaluate(),
dplyr, ggplot2, GGally, here, maptools, readr,
raster, readr, rsample, sf,
usdm) # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select
# options
set.seed(42)
options(
scipen = 999,
readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())
# paths
dir_data <- here("data/sdm")
pts_geo <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds <- file.path(dir_data, "mdl_maxent_vif.rds")
# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)
# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_alt 2.289819
## 2 WC_bio1 2.012931
## 3 WC_bio2 2.695058
## 4 ER_climaticMoistureIndex 2.522713
## 5 ER_topoWet 1.689862
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## No variable from the 5 input variables has collinearity problem.
##
## The linear correlation coefficients ranges between:
## min correlation ( WC_bio2 ~ WC_bio1 ): 0.06730115
## max correlation ( ER_climaticMoistureIndex ~ WC_bio2 ): -0.698806
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_alt 2.290098
## 2 WC_bio1 2.001709
## 3 WC_bio2 2.620283
## 4 ER_climaticMoistureIndex 2.511373
## 5 ER_topoWet 1.646661
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
Question: Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model?
None of my variables had colliniarity problems, so none were removed. ER_climaticMoistureIndex was the most important, followed by WC_bio1, WC_alt, ER_topoWet, and WC_bio2. The bottom three variables all just barely contributed.
# plot term plots
response(mdl_maxv)
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 1841
## n absences : 1845
## AUC : 0.8618393
## cor : 0.6226259
## max TPR+TNR at : 0.6428303
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6428303
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.92178164 0.2802168
## absent_obs 0.07821836 0.7197832
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")
plot(y_maxv > thr)